import nltk
nltk.data.path.append('C:/nltk_data')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import jieba  # 中文分词库


# Download the NLTK resources this script needs (tokenizer models and
# stop-word lists).  nltk.download is a no-op if the data is already
# present in download_dir, so re-running the script is cheap.
# NOTE(review): the hard-coded Windows path 'C:/nltk_data' must stay in
# sync with the nltk.data.path.append(...) call at the top of the file;
# consider making it configurable for non-Windows environments.
nltk.download('punkt', download_dir='C:/nltk_data')
nltk.download('stopwords', download_dir='C:/nltk_data')

# ====================
# Text preprocessing helper
# ====================
def preprocess_text(text, language='english'):
    """Tokenize *text* and drop punctuation, stop words, digits, and
    single-character tokens.

    Parameters
    ----------
    text : str
        Raw input text.
    language : str, optional
        ``'english'`` (default) uses NLTK tokenization and stop words;
        ``'chinese'`` uses jieba segmentation and a local stop-word file
        (``chinese_stopwords.txt`` in the working directory).

    Returns
    -------
    list[str]
        The filtered tokens, in original order.
    """
    # Strip ASCII punctuation in one C-level pass.
    # NOTE(review): string.punctuation covers ASCII only — full-width
    # Chinese punctuation such as "，。！" passes through untouched.
    text = text.translate(str.maketrans('', '', string.punctuation))

    if language == 'chinese':
        # Chinese word segmentation (jieba.cut returns a generator).
        words = jieba.cut(text)
        # Load the Chinese stop-word list; a context manager closes the
        # file handle (the original version leaked it).
        with open('chinese_stopwords.txt', encoding='utf-8') as fh:
            stop_words = set(fh.read().split())
    else:
        # English path: case-fold first so stop-word matching works
        # (NLTK's stop-word list is lowercase).
        text = text.lower()
        words = word_tokenize(text)
        stop_words = set(stopwords.words('english'))

    # Drop stop words, pure digits, and 1-character tokens.
    return [word for word in words
            if word not in stop_words
            and not word.isdigit()
            and len(word) > 1]


# ====================
# Sample text
# ====================
sample_text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, 
and artificial intelligence concerned with the interactions between computers 
and human language. It focuses on how to program computers to process and 
analyze large amounts of natural language data. Key tasks include text 
classification, sentiment analysis, machine translation, and speech recognition.
"""

# ====================
# Term-frequency (TF) statistics
# ====================
# Tokenize the sample text and count occurrences in a single step.
word_freq = Counter(preprocess_text(sample_text))

# Show the ten most frequent terms.
print("Top 10 TF高频词:")
for term, count in word_freq.most_common(10):
    print(f"{term}: {count}")

# ====================
# TF-IDF computation (multi-document example)
# ====================
documents = [
    "Natural language processing enables computers to understand human language.",
    "Machine learning is a key component of artificial intelligence.",
    "Text classification and sentiment analysis are common NLP tasks.",
    "Deep learning has revolutionized speech recognition systems."
]

# Fit the vectorizer on the whole corpus and score every document.
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(documents)
vocab = tfidf.get_feature_names_out()

# Pair each vocabulary term with its weight in the first document.
first_doc_scores = dict(zip(vocab, matrix[0].toarray().flatten()))

# Print the five highest-weighted terms of document 0.
print("\nTF-IDF示例（第一个文档）:")
ranked = sorted(first_doc_scores.items(), key=lambda kv: kv[1], reverse=True)
for term, weight in ranked[:5]:
    print(f"{term}: {weight:.4f}")

# ====================
# Visualization: word cloud
# ====================
# Build the cloud from the TF counts; generate_from_frequencies returns
# the WordCloud instance itself, which matplotlib can render directly.
cloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
)
cloud.generate_from_frequencies(word_freq)

plt.figure(figsize=(12, 6))
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Frequency Cloud')
plt.show()

# ====================
# Visualization: horizontal bar chart
# ====================
top_words = 10
# Unzip the (word, count) pairs into two parallel tuples.
labels, counts = zip(*word_freq.most_common(top_words))
positions = range(len(labels))

plt.figure(figsize=(12, 6))
plt.barh(positions, counts, color='skyblue')
plt.yticks(positions, labels)
plt.gca().invert_yaxis()  # put the most frequent word at the top
plt.xlabel('Frequency')
plt.title(f'Top {top_words} Most Frequent Words')
plt.tight_layout()
plt.show()